################################################################################
################################################################################
################################################################################


#### MrP - joint or marginals?

####	Lucas Leemann, lleemann@gmail.com
####	January 2015 




################################################################################
################################################################################
################################################################################
rm(list=ls())
library(MASS)
library(lme4)
library(blme)

################################################################################
#		It is assumed that the sample has 1,000 observations and there are 25
#		units. The first ten have each 70 respondents and the latter 15 only 20.
# 		Finally, the units do differ in there socio-economic structure (gammas
#		are NOT equally distributed within units). The population is only sampled
#		for gamma, whereas the level 2 stuff is added for each simulation
################################################################################

set.seed(2222)



start.time <-  Sys.time()

	###############################
	# generating Sample according to same distributions as Population	
	# independent draws

N.sim <- 100
pred.j <- matrix(NA,N.sim,25)
pred.m <- matrix(NA,N.sim,25)
pred.d <- matrix(NA,N.sim,25)
out.T <- matrix(NA,N.sim,25)
sq.err.joint.box <- rep(NA, N.sim)
sq.err.margi.box <- rep(NA, N.sim)
sq.err.dis.box <- rep(NA, N.sim)

unit.error.joint <- matrix(NA,N.sim,25)
unit.error.margi <- matrix(NA,N.sim,25)
unit.error.dis <- matrix(NA,N.sim,25)

load("draws_corr 0.6.RData")

	
mat.pop <- independent.draws[[3]]
summary(mat.pop)

true.out <- rep(NA,25)
	for (w in 1:25){
	true.out[w] <- mean(mat.pop[mat.pop[,2]==w,9])
	}
	
	raster <- 	rbind(c(1,1,1), c(1,1,2), c(1,1,3), c(1,1,4), c(1,2,1), c(1,2,2),c(1,2,3), c(1,2,4), c(1,3,1), c(1,3,2), c(1,3,3), c(1,3,4), c(1,4,1), c(1,4,2), c(1,4,3), c(1,4,4), c(2,1,1), c(2,1,2), c(2,1,3), c(2,1,4), c(2,2,1), c(2,2,2), c(2,2,3), c(2,2,4), c(2,3,1), c(2,3,2), c(2,3,3), c(2,3,4), c(2,4,1), c(2,4,2), c(2,4,3), c(2,4,4), c(3,1,1), c(3,1,2), c(3,1,3), c(3,1,4), c(3,2,1), c(3,2,2), c(3,2,3), c(3,2,4), c(3,3,1), c(3,3,2), c(3,3,3), c(3,3,4), c(3,4,1), c(3,4,2), c(3,4,3), c(3,4,4),c(4,1,1), c(4,1,2), c(4,1,3), c(4,1,4), c(4,2,1), c(4,2,2), c(4,2,3), c(4,2,4), c(4,3,1), c(4,3,2), c(4,3,3), c(4,3,4), c(4,4,1), c(4,4,2), c(4,4,3), c(4,4,4))

# creating census weighting object based on population data
# "gamma64.weight" is just the product of the three marginals	
	gamma64.weight <- rep(NA,64*25)
	divider <- c(rep(70000,10),rep(20000,15))
	for (z in 1:25){
		a <- (z-1)*64+1
		b <- z*64
		filler <- rep(NA,64)
		counter <- 1
		for (c in 1:64){
					filler[counter] <- c(table(mat.pop[mat.pop[,2]==z,3])/divider[z])[raster[c,1]] * c(table(mat.pop[mat.pop[,2]==z,4])/divider[z])[raster[c,2]] * c(table(mat.pop[mat.pop[,2]==z,5])/divider[z])[raster[c,3]]
					counter <- counter +1
		}
		gamma64.weight[a:b] <- filler
	}
	
# creating census weighting object based on population data
# "census.64" is based on the true joint distribution	
	census.64 <- rep(NA,64*25)
	unit <- mat.pop[,2]
		for (t in 1:25){
			a <- (t-1)*64+1
			b <- t*64
			census.64[a:b] <- rep(0,length(c(1:64)))
			block <- mat.pop[unit==t,6] 
		ww <- length(table(block))
				for (w in 1:ww){
					#census.data[a-1 + as.numeric(names(table(popL)[w]))] <- table(popL)[w]
					census.64[a-1 + as.numeric(names(table(block)[w]))] <- c(table(block)/sum(table(block)))[w]
				}
		}




for (c in 1:N.sim){
	
	N.obs <- 2000
	draws <- sample(c(1:1000000),N.obs)
	sample.d <- mat.pop[draws,]

	x.g1 <- sample.d[,3]
	x.g2 <- sample.d[,4]
	x.g3 <- sample.d[,5]

	x.cont <- seq(-2,2,length.out=25)
	X.CONT <- rep(NA,N.obs)
	for (t in 1:N.obs){
		X.CONT[t] <- x.cont[sample.d[t,2]]
	}

	y.observed <- sample.d[,9]
	placer <- sample.d[,2]
		
	#	model1 <- bglmer(y.observed ~ X.CONT + (1|x.g1)+ (1|x.g2)+ (1|x.g3)+ (1|placer) , cov.prior = invwishart, family = binomial(link = "probit"), control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 100000)))
	model1 <- glmer(y.observed ~ X.CONT + (1|x.g1)+ (1|x.g2)+ (1|x.g3)+ (1|placer) , family = binomial(link = "probit"), control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 100000)))
	
##########################################
### Prediction based on MrP with Joint


	# predict unit-effect
	y.lat.cont <- (cbind(1,x.cont)%*%fixef(model1) + ranef(model1)$placer)
	y.lat.cont <- y.lat.cont[,1]	
	
	# predict individual effect

	# first, generate prediction for each ideal type (64 different types):
	gamma1.pred <- ranef(model1)$x.g1[[1]][raster[,1]]
	gamma2.pred <- ranef(model1)$x.g2[[1]][raster[,2]]
	gamma3.pred <- ranef(model1)$x.g3[[1]][raster[,3]]
	ideal.type.pred <- gamma1.pred + gamma2.pred + gamma3.pred
	
	# create 64*25 prediction vector
	cont.contribution <- rep(NA,25*64)
	for (q in 1:25){
		a <- (q-1)*64+1
		b <- q*64
		cont.contribution[a:b] <- rep(y.lat.cont[q],64)		
		}
	latent.pred <- cont.contribution + rep(ideal.type.pred,25)
	predicted.support <- pnorm(latent.pred)
	
	
	# multiply census.information (for weighting) with predicted support per ideal type
	ppp <- predicted.support * census.64
	unit.l <- c(rep(1,64), rep(2,64), rep(3,64), rep(4,64), rep(5,64), rep(6,64), rep(7,64), rep(8,64), rep(9,64), rep(10,64), rep(11,64), rep(12,64), rep(13,64), rep(14,64), rep(15,64), rep(16,64), rep(17,64), rep(18,64), rep(19,64), rep(20,64), rep(21,64), rep(22,64), rep(23,64), rep(24,64), rep(25,64))
	out.pred.joi <- rep(NA,25)
	for (q in 1:25){		
		out.pred.joi[q] <- sum(ppp[unit.l==q])
	}

	#plot(out.pred.joi, true.out, ylim=c(0,1), xlim=c(0,1))	

##########################################
### Prediction based on MrP with Marginals
		
	# combining context and individuum
	pred.prob <- pnorm(cont.contribution + rep(ideal.type.pred,25))
	ppp2 <- pred.prob*gamma64.weight
unit.64 <- c(rep(1,64), rep(2,64), rep(3,64), rep(4,64), rep(5,64), rep(6,64), rep(7,64), rep(8,64), rep(9,64), rep(10,64), rep(11,64), rep(12,64), rep(13,64), rep(14,64), rep(15,64), rep(16,64), rep(17,64), rep(18,64), rep(19,64), rep(20,64), rep(21,64), rep(22,64), rep(23,64), rep(24,64), rep(25,64))	
out.pred.mar <- rep(NA,25)
	for (q in 1:25){		
		out.pred.mar[q] <- sum(ppp2[unit.64==q])
	}

	#points(out.pred.mar, true.out, ylim=c(0,1), xlim=c(0,1), col="red")	


##########################################
### Prediction based on Disaggregation

	out.dis <- rep(NA,25)
	for (u in 1:25){
		out.dis[u] <- mean(sample.d[sample.d[,2]==u,9])
	}

#### Compare True and Predicted Values

	unit.error.joint[c,] <- out.pred.joi - true.out
	unit.error.margi[c,] <- out.pred.mar - true.out
	unit.error.dis[c,] <- out.dis - true.out
		
	sq.err.joint <- mean((out.pred.joi - true.out)^2)	
	sq.err.margi <- mean((out.pred.mar - true.out)^2)
	sq.err.dis <- mean((out.dis - true.out)^2)
	
	sq.err.joint.box[c] <- sq.err.joint
	sq.err.margi.box[c] <- sq.err.margi	
	sq.err.dis.box[c] <- sq.err.dis
	
	pred.j[c,] <- out.pred.joi
	pred.d[c,] <- out.dis
	pred.m[c,] <- out.pred.mar
	out.T[c,] <- true.out	

	#print(c)

	}
	
	#plot(pred.j,pred.m)
	
	
################################################################################
# analyze disaggregatin error

dis.err <- out.T-pred.d


################################################################################
#max.error.dis <- apply(abs(dis.err),1,max)
#error.order.dis <- order(max.error.dis)
#qq <- length(max.error.dis)

#par(mfrow=c(1,2), family="CMU Serif")
#plot(c(1:qq),(max.error.dis[error.order.dis])^2, ylim=c(0,0.35),ylab="Mean Squared Error",cex=0.6, xlab="Simulations (in increasing order of Disagg. estimation error)")
#points(c(1:qq),sq.err.margi.box[error.order.dis], pch=19, cex=0.6, col="blue")
#points(c(1:qq),sq.err.joint.box[error.order.dis], pch=2, cex=0.6, col="red")
#legend(0.1,0.2,c("MSE of Marginal","MSE of Joint", "MSE of Disaggregation"), pch=c(19,2,1), bty="n", col=c("blue","red","black"))

#	plot(pred.j,pred.m, col=rgb(173,216,230,100, maxColorValue=255), xlab="Unit Estimates from Joint", ylab="Unit Estimates from Marginal", cex=0.5, pch=19)

end.time <-  Sys.time()
cat(paste("Job finished at:",end.time))
dur <- end.time-start.time
cat(paste("Job took:",dur))


sim.2000.0.6rho <- 	list(pred.j, pred.d, pred.m, out.T, sq.err.joint.box, sq.err.margi.box, sq.err.dis.box, 	unit.error.joint, unit.error.margi, unit.error.dis)

save(sim.2000.0.6rho, file="sim.2000.06.B00.RData")